This is the third installment of Applying Machine Learning to Kaggle Datasets, a series of IPython notebooks demonstrating the methods described in the Stanford Machine Learning Course. In each notebook, I apply one method taught in the course to an open Kaggle competition.
In this notebook, I demonstrate the use of an artificial neural network in the Titanic competition.
In [122]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import code.Neural_Net_Funcs as NNF
import neurolab as nl
In [123]:
# Re-execute the local helper module so that edits made to it on disk are
# picked up without restarting the kernel. The returned module object is the
# cell's displayed output.
reload(NNF)
Out[123]:
In [124]:
# Load the Kaggle training set, indexed by passenger id, and preview its
# first rows as the cell output.
train_csv_path = "./data/titanic/train.csv"
train = pd.read_csv(train_csv_path, index_col="PassengerId")
train.head()
Out[124]:
In [125]:
# Disabled exploration: cross-tabulate survival (as a boolean) against
# passenger class and sex. Kept commented out; nothing below depends on it.
#temp = pd.crosstab([train.Pclass, train.Sex],train.Survived.astype(bool))
#temp
In [126]:
# Disabled exploration: seaborn plots of Survived against Pclass / Embarked
# (split by Sex or Pclass) and kernel-density plots of Age per Pclass.
# Kept commented out; nothing below depends on it.
#sb.set(style="white")
#sb.factorplot('Pclass','Survived','Sex',data=train,palette="muted")
#sb.factorplot('Embarked','Survived','Pclass',data=train,palette="muted")
#sb.factorplot('Embarked','Survived','Sex',data=train,palette="muted")
#fg = sb.FacetGrid(train,hue="Pclass",aspect=3,palette="muted")
#fg.map(sb.kdeplot,"Age",bw=4,shade=True,legend=True)
#fg.set(xlim=(0,80))
In [171]:
reload(NNF)
# Build two training sets from the same frame: the default call and the
# Age=False variant (presumably with / without the Age feature -- confirm in
# Neural_Net_Funcs). Each call returns the network inputs, the targets, the
# min/max list later handed to nl.net.newff, and the passenger ids.
# The ids of the first call get their own name (pid_age, as in the test-set
# cell) so the second call no longer silently overwrites them.
datain_age, dataout_age, min_max_list_age, pid_age = NNF.make_input_output(train)
datain, dataout, min_max_list, pid = NNF.make_input_output(train, Age=False)
In [172]:
# Show how many observations each of the two training sets contains
# (age variant first, no-age variant second).
print("%d %d" % (len(datain_age), len(datain)))
In [172]:
In [172]:
In [173]:
# Get arguments to neurolab net, feed-forward network
# Create the net
# By default, every layer uses the hyperbolic tangent sigmoid (tanh) as its
# activation function, and all layers have a bias node.
#net.trainf = nl.train.train_gdm
In [178]:
# Build and train the network on the Age=False training data.
# Seed NumPy's global RNG first: neurolab draws the initial weights from it,
# so without a fixed seed every run of this cell trains a different network.
np.random.seed(0)
m = datain.shape[0] # number of observations
ci = datain.shape[1] # number of input nodes
layers = [ci,1] # One hidden layer with ci nodes, followed by a single output node
net = nl.net.newff(min_max_list,layers)
# err holds the training errors reported by neurolab (plotted in a later cell);
# training stops at the error goal or after 20 epochs, whichever comes first.
err = net.train(datain, dataout, show=2,goal=0.01,epochs=20)
# Persist the trained network so it can be reloaded without retraining.
net.save('myfirst_net_noage.sav')
In [180]:
# Build and train a second network on the training data that includes age.
# Seed NumPy's global RNG first: neurolab draws the initial weights from it,
# so without a fixed seed every run of this cell trains a different network.
np.random.seed(0)
m_age = datain_age.shape[0] # number of observations
ci_age = datain_age.shape[1] # number of input nodes
layers_age = [ci_age,1] # One hidden layer with ci_age nodes, followed by a single output node
net_age = nl.net.newff(min_max_list_age,layers_age)
# err_age holds the training errors reported by neurolab (plotted in a later
# cell); training stops at the error goal or after 20 epochs, whichever comes first.
err_age = net_age.train(datain_age, dataout_age, show=2,goal=0.01,epochs=20)
# Persist the trained network so it can be reloaded without retraining.
net_age.save('myfirst_net_age.sav')
In [181]:
# Compare how the training error of the two networks evolves, normalised by
# the number of observations so the curves are comparable even though the two
# training sets differ in size.
# plt.hold(True) is gone: it was removed from matplotlib and successive plot
# calls on the same axes already overlay by default.
fig, ax = plt.subplots()
ax.plot(np.array(err) / len(datain), label='No Age')
ax.plot(np.array(err_age) / len(datain_age), label='Age')
ax.set_xlabel('Training epoch')
ax.set_ylabel('Training error per observation')
ax.legend()
Out[181]:
In [182]:
# Print fraction of results correctly modeled, followed by the number of
# observations, for each of the two networks.
# The network output is thresholded with np.sign and compared element-wise to
# the targets. The builtin float replaces np.float, which was only an alias
# for it and has been removed from NumPy.
for label, network, inputs, targets in (
        ("Fraction correct (no age): ", net, datain, dataout),
        ("Fraction correct (w/ age): ", net_age, datain_age, dataout_age)):
    trainsim = np.sign(network.sim(inputs))
    correct = trainsim == targets
    fraction = np.sum(correct) / float(len(correct))
    # Joining with a single space reproduces the spacing of a multi-argument print.
    print(" ".join([label, str(fraction), str(len(correct))]))
In [183]:
# Load the Kaggle test set, indexed by passenger id, then refresh the helper
# module (its repr is the cell output).
test_csv_path = "./data/titanic/test.csv"
test = pd.read_csv(test_csv_path, index_col="PassengerId")
reload(NNF)
Out[183]:
In [184]:
# Build the inputs for the age network from the test set.
# NOTE: this rebinds datain_age / dataout_age / min_max_list_age, which held
# the training-set arrays until now; re-running earlier cells after this one
# will operate on test data.
(datain_age,
 dataout_age,
 min_max_list_age,
 pid_age) = NNF.make_input_output(test, Test=True)
In [185]:
# Build the inputs for the no-age network from the test set.
# NOTE: this rebinds datain / dataout / min_max_list / pid, which held the
# training-set values until now; re-running earlier cells after this one will
# operate on test data.
(datain,
 dataout,
 min_max_list,
 pid) = NNF.make_input_output(test, Test=True, Age=False)
In [186]:
# Run both trained networks on their test inputs and keep only the sign of
# the raw output as the predicted class.
raw_output_age = net_age.sim(datain_age)
raw_output_noage = net.sim(datain)
predict_age = np.sign(raw_output_age)
predict_noage = np.sign(raw_output_noage)
In [187]:
# Merge the predictions of both networks into a single Kaggle submission file.
predictions = np.concatenate([predict_age, predict_noage])
# np.sign yields -1, 0 or +1; only +1 is kept, everything else becomes 0.
predictions = np.where(predictions == 1, predictions, 0)
# Passenger ids are concatenated in the same order as the predictions above.
passengerid = np.concatenate([pid_age, pid])
dfout = pd.DataFrame(predictions, index=passengerid, columns=['Survived'])
# Spell the id column exactly as in the input CSVs ("PassengerId"); the
# previous 'PassengerID' did not match the data's column name.
dfout.index.name = 'PassengerId'
dfout = dfout.astype(int)
# DataFrame.sort() has been removed from pandas; sort_index() is the
# equivalent call for ordering the rows by passenger id.
dfout = dfout.sort_index()
dfout.to_csv('./predictions/Neural_Network_Prediction.csv', sep=',')